{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Credit Card Fraud Detection\n",
    "### Based on the Kaggle [Fraud Detection Data](https://www.kaggle.com/code/zwhjorth/dnn-svm-and-dt-for-fraud-detection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install all requirements needed to train this model and track it in MLFlow.\n",
    "!pip install pip -qU\n",
    "!pip install -r requirements.txt -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the dependencies we need to run the code.\n",
    "\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, BatchNormalization, Activation\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.utils import class_weight\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import tf2onnx\n",
    "\n",
    "import mlflow\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "import keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get the external route for the MLFlow Server"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the environment variable for how to reach MLFlow.\n",
    "# The general format is http://<route-to-mlflow>:<port> where you replace <route-to-mlflow> and <port> with specifics for where your MLFlow instance is set up and/or exposed.\n",
    "\n",
    "MLFLOW_ROUTE = os.getenv(\"MLFLOW_ROUTE\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data into a pandas dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CSV data which we will use to train the model.\n",
    "# It contains the following fields:\n",
    "#   distancefromhome - The distance from home where the transaction happened.\n",
    "#   distancefromlast_transaction - The distance from last transaction happened.\n",
    "#   ratiotomedianpurchaseprice - Ratio of purchased price compared to median purchase price.\n",
    "#   repeat_retailer - If it's from a retailer that already has been purchased from before.\n",
    "#   used_chip - If the (credit card) chip was used.\n",
    "#   usedpinnumber - If the PIN number was used.\n",
    "#   online_order - If it was an online order.\n",
    "#   fraud - If the transaction is fraudulent.\n",
    "\n",
    "Data = pd.read_csv('../data/card_transdata.csv')\n",
    "Data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split the data into training and test sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the input (X) and output (Y) data. \n",
    "# The only output data we have is if it's fraudulent or not, and all other fields go as inputs to the model.\n",
    "\n",
    "X = Data.drop(columns = ['fraud'])\n",
    "y = Data['fraud']\n",
    "\n",
    "# Split the data into training and testing sets so we have something to test the trained model with.\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, stratify = y)\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size = 0.2, stratify = y_train)\n",
    "\n",
    "# Scale the data to remove mean and have unit variance. This means that the data will be between -1 and 1, which makes it a lot easier for the model to learn than random potentially large values.\n",
    "# It is important to only fit the scaler to the training data, otherwise you are leaking information about the global distribution of variables (which is influenced by the test set) into the training set.\n",
    "\n",
    "scaler = StandardScaler()\n",
    "\n",
    "X_train = scaler.fit_transform(X_train)\n",
    "\n",
    "# Since the dataset is unbalanced (it has many more non-fraud transactions than fraudulent ones), we set a class weight to weight the few fraudulent transactions higher than the many non-fraud transactions.\n",
    "\n",
    "class_weights = class_weight.compute_class_weight('balanced',classes = np.unique(y_train),y = y_train)\n",
    "class_weights = {i : class_weights[i] for i in range(len(class_weights))}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build the DNN model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model, the model we build here is a simple fully connected deep neural network, containing 3 hidden layers and one output layer.\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Dense(32, name='dense', activation = 'relu', input_dim = len(X.columns)))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(32, name='dense_02'))\n",
    "model.add(BatchNormalization())\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(32, name='dense_03'))\n",
    "model.add(BatchNormalization())\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(1, name='dense_04', activation = 'sigmoid'))\n",
    "model.compile(optimizer='SGD',loss='binary_crossentropy',metrics=['accuracy'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configure MLFlow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up tracking to MLFlow by using the route from earlier. We call the experiment and model DNN-credit-fraud.\n",
    "# We also enable autologging for MLFlow so we don't have to manually log the model or any metrics besides custom ones.\n",
    "\n",
    "mlflow.set_tracking_uri(MLFLOW_ROUTE)\n",
    "mlflow.set_experiment(\"DNN-credit-card-fraud\")\n",
    "mlflow.tensorflow.autolog(registered_model_name=\"DNN-credit-card-fraud\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model, plot the confusion matrix and push the artifacts to MLFlow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the model.\n",
    "# We wrap the training with an mlflow wrapper to signify that this is an experiment run.\n",
    "# We also define a few more metrics at the very bottom to track the confusion matrix in MLFlow.\n",
    "\n",
    "with mlflow.start_run():\n",
    "    epochs = 2\n",
    "    history = model.fit(X_train, y_train, epochs=epochs, \\\n",
    "                        validation_data=(scaler.transform(X_val),y_val), \\\n",
    "                        verbose = True, class_weight = class_weights)\n",
    "\n",
    "    y_pred_temp = model.predict(scaler.transform(X_test)) \n",
    "\n",
    "    threshold = 0.995\n",
    "\n",
    "    y_pred = np.where(y_pred_temp > threshold, 1,0)\n",
    "    c_matrix = confusion_matrix(y_test,y_pred)\n",
    "    ax = sns.heatmap(c_matrix, annot=True, cbar=False, cmap='Blues')\n",
    "    ax.set_xlabel(\"Prediction\")\n",
    "    ax.set_ylabel(\"Actual\")\n",
    "    ax.set_title('Confusion Matrix')\n",
    "    plt.show()\n",
    "\n",
    "    t_n, f_p, f_n, t_p = c_matrix.ravel()\n",
    "    mlflow.log_metric(\"tn\", t_n)\n",
    "    mlflow.log_metric(\"fp\", f_p)\n",
    "    mlflow.log_metric(\"fn\", f_n)\n",
    "    mlflow.log_metric(\"tp\", t_p)\n",
    "\n",
    "    model_proto,_ = tf2onnx.convert.from_keras(model)\n",
    "    mlflow.onnx.log_model(model_proto, \"models\")\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the model locally in ONNX format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import onnx\n",
    "onnx.save(model_proto, \"fraud.onnx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the model locally in tensorflow's **saved_model** format. This is useful for working with different model servers such as Triton."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.models.save_model(model, filepath=\"fraud/1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.14",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.14"
  },
  "vscode": {
   "interpreter": {
    "hash": "1634c0bc43905e7916bfdb805d9fa90ddc101c0f948f75bff344e1199ec8d02f"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
